{
  "cells": [
    {
      "cell_type": "markdown",
      "id": "2b6441a9",
      "metadata": {
        "id": "2b6441a9"
      },
      "source": [
        "# How to calculate Embeddings drift?"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "1f683a69",
      "metadata": {
        "id": "1f683a69"
      },
      "outputs": [],
      "source": [
        "try:\n",
        "    import evidently\n",
        "except ImportError:\n",
        "    # Only a missing package triggers the install; any other import failure is surfaced.\n",
        "    # %pip (rather than !pip) installs into the environment of the running kernel.\n",
        "    # NOTE(review): this notebook imports evidently's Report/TestSuite API; pinning a\n",
        "    # compatible release instead of the repository head may be needed - confirm.\n",
        "    %pip install git+https://github.com/evidentlyai/evidently.git"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6a1899a8",
      "metadata": {
        "id": "6a1899a8"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import pandas as pd\n",
        "\n",
        "from sklearn import datasets\n",
        "\n",
        "from evidently import ColumnMapping\n",
        "from evidently.metric_preset import DataDriftPreset\n",
        "from evidently.metrics import EmbeddingsDriftMetric\n",
        "from evidently.metrics.data_drift.embedding_drift_methods import model, distance, ratio, mmd\n",
        "from evidently.report import Report\n",
        "from evidently.test_preset import DataDriftTestPreset, NoTargetPerformanceTestPreset\n",
        "from evidently.test_suite import TestSuite\n",
        "from evidently.tests import TestEmbeddingsDrift"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "3764d56f",
      "metadata": {
        "id": "3764d56f"
      },
      "outputs": [],
      "source": [
        "# You might need the pillow library to use datasets.fetch_lfw_people() from sklearn.\n",
        "\n",
        "try:\n",
        "    import PIL\n",
        "except ImportError:\n",
        "    # %pip (rather than !pip) installs into the environment of the running kernel.\n",
        "    %pip install pillow"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "a27cdc8b",
      "metadata": {
        "id": "a27cdc8b"
      },
      "source": [
        "## Prepare a Dataset"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "33d95ea1",
      "metadata": {
        "id": "33d95ea1"
      },
      "outputs": [],
      "source": [
        "# Fetch the LFW people dataset from scikit-learn and put its 'data' array in a DataFrame.\n",
        "lfw_people = datasets.fetch_lfw_people()\n",
        "embeddings_data = pd.DataFrame(lfw_people['data'])\n",
        "embeddings_data.columns = [f'col_{idx}' for idx in embeddings_data.columns]\n",
        "\n",
        "# Keep only the first 5100 rows and the first 31 columns.\n",
        "embeddings_data = embeddings_data.iloc[:5100, :31]\n",
        "\n",
        "# Copy of the data in which rows 2500-4999 have their first five columns set to 0.\n",
        "embeddings_data_shifted = embeddings_data.copy()\n",
        "embeddings_data_shifted.iloc[2500:5000, :5] = 0"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "4f659a33",
      "metadata": {
        "id": "4f659a33"
      },
      "source": [
        "## Embeddings Drift Report"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7c434afe",
      "metadata": {
        "id": "7c434afe"
      },
      "outputs": [],
      "source": [
        "# Two named groups of embedding columns plus a target column.\n",
        "# NOTE(review): the column at position 29 is in neither group nor the target - confirm this is intended.\n",
        "column_mapping = ColumnMapping(\n",
        "    embeddings={\n",
        "        'small_subset': embeddings_data.columns[:10],\n",
        "        'big_subset': embeddings_data.columns[10:29],\n",
        "    },\n",
        "    target=embeddings_data.columns[30],\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a2a074da",
      "metadata": {
        "id": "a2a074da"
      },
      "outputs": [],
      "source": [
        "# EmbeddingsDriftMetric with no drift_method argument, run on the unmodified data.\n",
        "report = Report(metrics=[EmbeddingsDriftMetric('small_subset')])\n",
        "\n",
        "report.run(\n",
        "    reference_data=embeddings_data[:2500],\n",
        "    current_data=embeddings_data[2500:5000],\n",
        "    column_mapping=column_mapping,\n",
        ")\n",
        "report"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "3a5b6b38",
      "metadata": {
        "id": "3a5b6b38"
      },
      "source": [
        "### Embeddings Drift Detection: model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7953ec07",
      "metadata": {
        "id": "7953ec07"
      },
      "outputs": [],
      "source": [
        "# Drift method `model` with every parameter spelled out.\n",
        "model_method = model(\n",
        "    threshold=0.55,\n",
        "    bootstrap=None,\n",
        "    quantile_probability=0.95,\n",
        "    pca_components=None,\n",
        ")\n",
        "report = Report(metrics=[\n",
        "    EmbeddingsDriftMetric('small_subset', drift_method=model_method)\n",
        "])\n",
        "\n",
        "# Reference and current windows are taken from the shifted copy of the data.\n",
        "report.run(\n",
        "    reference_data=embeddings_data_shifted[:2500],\n",
        "    current_data=embeddings_data_shifted[2500:5000],\n",
        "    column_mapping=column_mapping,\n",
        ")\n",
        "report"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "f38c62ec",
      "metadata": {
        "id": "f38c62ec"
      },
      "source": [
        "### Embeddings Drift Detection: mmd"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ecae5b1d",
      "metadata": {
        "id": "ecae5b1d"
      },
      "outputs": [],
      "source": [
        "# Drift method `mmd` with every parameter spelled out.\n",
        "mmd_method = mmd(\n",
        "    threshold=0.015,\n",
        "    bootstrap=None,\n",
        "    quantile_probability=0.95,\n",
        "    pca_components=None,\n",
        ")\n",
        "report = Report(metrics=[\n",
        "    EmbeddingsDriftMetric('small_subset', drift_method=mmd_method)\n",
        "])\n",
        "\n",
        "# Reference and current windows are taken from the shifted copy of the data.\n",
        "report.run(\n",
        "    reference_data=embeddings_data_shifted[:2500],\n",
        "    current_data=embeddings_data_shifted[2500:5000],\n",
        "    column_mapping=column_mapping,\n",
        ")\n",
        "report"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "56a5e5b9",
      "metadata": {
        "id": "56a5e5b9"
      },
      "source": [
        "### Embeddings Drift Detection: ratio"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d3a4cb11",
      "metadata": {
        "id": "d3a4cb11"
      },
      "outputs": [],
      "source": [
        "# Drift method `ratio` with every parameter spelled out.\n",
        "ratio_method = ratio(\n",
        "    component_stattest='wasserstein',\n",
        "    component_stattest_threshold=0.1,\n",
        "    threshold=0.2,\n",
        "    pca_components=None,\n",
        ")\n",
        "report = Report(metrics=[\n",
        "    EmbeddingsDriftMetric('small_subset', drift_method=ratio_method)\n",
        "])\n",
        "\n",
        "# Reference and current windows are taken from the shifted copy of the data.\n",
        "report.run(\n",
        "    reference_data=embeddings_data_shifted[:2500],\n",
        "    current_data=embeddings_data_shifted[2500:5000],\n",
        "    column_mapping=column_mapping,\n",
        ")\n",
        "report"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "29c0835f",
      "metadata": {
        "id": "29c0835f"
      },
      "source": [
        "### Embeddings Drift Detection: distance"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "511be378",
      "metadata": {
        "id": "511be378"
      },
      "outputs": [],
      "source": [
        "# Drift method `distance` with every parameter spelled out.\n",
        "distance_method = distance(\n",
        "    dist='euclidean',  # \"euclidean\", \"cosine\", \"cityblock\" or \"chebyshev\"\n",
        "    threshold=0.2,\n",
        "    pca_components=None,\n",
        "    bootstrap=None,\n",
        "    quantile_probability=0.95,\n",
        ")\n",
        "report = Report(metrics=[\n",
        "    EmbeddingsDriftMetric('small_subset', drift_method=distance_method)\n",
        "])\n",
        "\n",
        "# Reference and current windows are taken from the shifted copy of the data.\n",
        "report.run(\n",
        "    reference_data=embeddings_data_shifted[:2500],\n",
        "    current_data=embeddings_data_shifted[2500:5000],\n",
        "    column_mapping=column_mapping,\n",
        ")\n",
        "report"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Data Drift Metric Preset"
      ],
      "metadata": {
        "id": "kV1ois0rlwPE"
      },
      "id": "kV1ois0rlwPE"
    },
    {
      "cell_type": "code",
      "source": [
        "# DataDriftPreset covering both embedding groups, with a drift method given per group.\n",
        "# All names used here come from the import cell at the top of the notebook.\n",
        "report = Report(metrics=[\n",
        "    DataDriftPreset(\n",
        "        embeddings=['small_subset', 'big_subset'],\n",
        "        embeddings_drift_method={\n",
        "            'small_subset': ratio(),\n",
        "            'big_subset': ratio(pca_components=5),\n",
        "        },\n",
        "    )\n",
        "])\n",
        "\n",
        "report.run(\n",
        "    reference_data=embeddings_data[:2500],\n",
        "    current_data=embeddings_data[2500:5000],\n",
        "    column_mapping=column_mapping,\n",
        ")\n",
        "report"
      ],
      "metadata": {
        "id": "sKs4qr5rl1Ke"
      },
      "id": "sKs4qr5rl1Ke",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Embeddings Drift Test"
      ],
      "metadata": {
        "id": "M8NMmbHCmI2T"
      },
      "id": "M8NMmbHCmI2T"
    },
    {
      "cell_type": "code",
      "source": [
        "# Test suite holding a single TestEmbeddingsDrift for the 'small_subset' embeddings.\n",
        "embeddings_drift_test = TestEmbeddingsDrift(embeddings_name='small_subset')\n",
        "tests = TestSuite(tests=[embeddings_drift_test])\n",
        "\n",
        "tests.run(\n",
        "    reference_data=embeddings_data[:2500],\n",
        "    current_data=embeddings_data[2500:5000],\n",
        "    column_mapping=column_mapping,\n",
        ")\n",
        "tests"
      ],
      "metadata": {
        "id": "4OOyI0LYmMlK"
      },
      "id": "4OOyI0LYmMlK",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Data Drift Test Preset"
      ],
      "metadata": {
        "id": "i6yd3aizmXkT"
      },
      "id": "i6yd3aizmXkT"
    },
    {
      "cell_type": "code",
      "source": [
        "# Test suite built from DataDriftTestPreset with embeddings=['small_subset'].\n",
        "drift_test_preset = DataDriftTestPreset(embeddings=['small_subset'])\n",
        "tests = TestSuite(tests=[drift_test_preset])\n",
        "\n",
        "tests.run(\n",
        "    column_mapping=column_mapping,\n",
        "    reference_data=embeddings_data[:2500],\n",
        "    current_data=embeddings_data[2500:5000],\n",
        ")\n",
        "tests"
      ],
      "metadata": {
        "id": "xesoGy28meUV"
      },
      "id": "xesoGy28meUV",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## No Target Performance Test Preset"
      ],
      "metadata": {
        "id": "qDPR7vEZmlX5"
      },
      "id": "qDPR7vEZmlX5"
    },
    {
      "cell_type": "code",
      "source": [
        "# Test suite built from NoTargetPerformanceTestPreset over both embedding groups.\n",
        "no_target_preset = NoTargetPerformanceTestPreset(embeddings=['small_subset', 'big_subset'])\n",
        "tests = TestSuite(tests=[no_target_preset])\n",
        "\n",
        "tests.run(\n",
        "    column_mapping=column_mapping,\n",
        "    reference_data=embeddings_data[:2500],\n",
        "    current_data=embeddings_data[2500:5000],\n",
        ")\n",
        "tests"
      ],
      "metadata": {
        "id": "rZBXQIxSmppS"
      },
      "id": "rZBXQIxSmppS",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.16"
    },
    "colab": {
      "provenance": []
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}